
#######################################################################################################
# Autor: Jan Stuckatz
# Date: 2019/03/04
# Paper Title: Political Alignment between Firms and Employees in the United States: Evidence from a new Dataset
# Replication of Figures and Tables the Online Appendix
######################################################################################################



# set wd to where code is, if using RStudio
setwd(dirname(rstudioapi::getActiveDocumentContext()$path))

# start log file
sink("log_appendix.txt")


# Load Necessary Packages
library(dplyr)
library(rio)
library(tidyr)
library(lfe)
library(stargazer)
library(xtable)



#########################################
########### Appendix Figures ############
########################################


##### Figure A1: by House/Senate/President
# House
load("R_shares.RDta") # read data
ic_RS_H <- ic_RS %>% filter(office_race == "H")


# Scatter Plot: IND vs PAC Rshare
pdf("figure_A1.pdf", width = 19, height = 6) # open graphics device
par(mfrow = c(1,3), mar = c(5,5.5,4,2), las = 1, cex.axis = 1.35, cex.lab = 1.6, cex.main = 2)
plot(ic_RS_H$Rshare_c, ic_RS_H$Rshare_i, pch = 16,col = "darkgrey", las = 0.8, font.lab = 1, axes=FALSE,
     main = "House Candidates",  ylim = c(0,1), xlim = c(0,1),
     xlab = "Firm REP Contribution Share", ylab = "Employee REP Contribution Share")
grid(col = "lightgrey", lty = "solid")
points(ic_RS_H$Rshare_c, ic_RS_H$Rshare_i, pch = 1,col = "black", las = 0.8, font.lab = 2,
       main = "",  ylim = c(0,1), xlim = c(0,1))
text(x = 0.1, y = 0.95, labels = expression(paste(beta, " = 0.64")), cex = 1.75)
axis(1, at = seq(0, 1, 0.2))
axis(2, at = seq(0, 1, 0.2))
abline(lm(ic_RS_H$Rshare_i ~ ic_RS_H$Rshare_c), col = "black", lwd = 2)



# Senate
ic_RS_S <- ic_RS %>% filter(office_race == "S")

# Scatter Plot: IND vs PAC Rshare
plot(ic_RS_S$Rshare_c, ic_RS_S$Rshare_i, pch = 16,col = "darkgrey", las = 0.8, font.lab = 1, axes=FALSE,
     main = "Senate Candidates",  ylim = c(0,1), xlim = c(0,1),
     xlab = "Firm REP Contribution Share", ylab = "Employee REP Contribution Share")
grid(col = "lightgrey", lty = "solid")
points(ic_RS_S$Rshare_c, ic_RS_S$Rshare_i, pch = 1,col = "black", las = 0.8, font.lab = 2,
       main = "",  ylim = c(0,1), xlim = c(0,1))
text(x = 0.1, y = 0.95, labels = expression(paste(beta, " = 0.55")), cex = 1.75)
axis(1, at = seq(0, 1, 0.2))
axis(2, at = seq(0, 1, 0.2))
abline(lm(ic_RS_S$Rshare_i ~ ic_RS_S$Rshare_c), col = "black", lwd = 2)



# President
ic_RS_P <- ic_RS %>% filter(office_race == "P")

# Scatter Plot: IND vs PAC Rshare
plot(ic_RS_P$Rshare_c, ic_RS_P$Rshare_i, pch = 16,col = "darkgrey", las = 0.8, font.lab = 1, axes=FALSE,
     main = "Presidential Candidates",  ylim = c(0,1), xlim = c(0,1),
     xlab = "Firm REP Contribution Share", ylab = "Employee REP Contribution Share")
grid(col = "lightgrey", lty = "solid")
points(ic_RS_P$Rshare_c, ic_RS_P$Rshare_i, pch = 1,col = "black", las = 0.8, font.lab = 2,
       main = "",  ylim = c(0,1), xlim = c(0,1))
text(x = 0.1, y = 0.95, labels = expression(paste(beta, " = 0.25")), cex = 1.75)
axis(1, at = seq(0, 1, 0.2))
axis(2, at = seq(0, 1, 0.2))
abline(lm(ic_RS_P$Rshare_i ~ ic_RS_P$Rshare_c), col = "black", lwd = 2)
dev.off() # close graphics device
rm(ic_RS_H, ic_RS_S, ic_RS_P, ic_RS) # clean up





#### Figure A2: Incumbents/Challengers/Open Seat
load("R_shares.RDta") # read data
ic_RS_I <- ic_RS %>% filter(office_race == "I")


# Incumbent
pdf("figure_A2.pdf", width = 19, height = 6) # open graphics device
par(mfrow = c(1,3), mar = c(5,5.5,4,2), las = 1, cex.axis = 1.35, cex.lab = 1.6, cex.main = 2)
plot(ic_RS_I$Rshare_c, ic_RS_I$Rshare_i, pch = 16,col = "darkgrey", las = 0.8, font.lab = 1, axes=FALSE,
     main = "Incumbent",  ylim = c(0,1), xlim = c(0,1),
     xlab = "Firm REP Contribution Share", ylab = "Employee REP Contribution Share")
grid(col = "lightgrey", lty = "solid")
points(ic_RS_I$Rshare_c, ic_RS_I$Rshare_i, pch = 1,col = "black", las = 0.8, font.lab = 2,
       main = "",  ylim = c(0,1), xlim = c(0,1))
text(x = 0.1, y = 0.95, labels = expression(paste(beta, " = 0.58")), cex = 1.75)
axis(1, at = seq(0, 1, 0.2))
axis(2, at = seq(0, 1, 0.2))
abline(lm(ic_RS_I$Rshare_i ~ ic_RS_I$Rshare_c), col = "black", lwd = 2)


## Challenger
ic_RS_C <- ic_RS %>% filter(office_race == "C")

plot(ic_RS_C$Rshare_c, ic_RS_C$Rshare_i, pch = 16,col = "darkgrey", las = 0.8, font.lab = 1,
     main = "Challenger",  ylim = c(0,1), xlim = c(0,1),
     xlab = "Firm REP Contribution Share", ylab = "Employee REP Contribution Share")
grid(col = "lightgrey", lty = "solid")
points(ic_RS_C$Rshare_c, ic_RS_C$Rshare_i, pch = 1,col = "black", las = 0.8, font.lab = 2,
       main = "",  ylim = c(0,1), xlim = c(0,1))
text(x = 0.1, y = 0.95, labels = expression(paste(beta, " = 0.35")), cex = 1.75)
axis(1, at = seq(0, 1, 0.2))
axis(2, at = seq(0, 1, 0.2))
abline(lm(ic_RS_C$Rshare_i ~ ic_RS_C$Rshare_c), col = "black", lwd = 2)


## Open Seat
ic_RS_O <- ic_RS %>% filter(office_race == "O")

plot(ic_RS_O$Rshare_c, ic_RS_O$Rshare_i, pch = 16,col = "darkgrey", las = 0.8, font.lab = 1,
     main = "Open Seat",  ylim = c(0,1), xlim = c(0,1),
     xlab = "Firm REP Contribution Share", ylab = "Employee REP Contribution Share")
grid(col = "lightgrey", lty = "solid")
points(ic_RS_O$Rshare_c, ic_RS_O$Rshare_i, pch = 1,col = "black", las = 0.8, font.lab = 2,
       main = "",  ylim = c(0,1), xlim = c(0,1))
text(x = 0.1, y = 0.95, labels = expression(paste(beta, " = 0.43")), cex = 1.75)
axis(1, at = seq(0, 1, 0.2))
axis(2, at = seq(0, 1, 0.2))
abline(lm(ic_RS_O$Rshare_i ~ ic_RS_O$Rshare_c), col = "black", lwd = 2)

dev.off() # close graphics device
rm(ic_RS_C, ic_RS_I, ic_RS_O, ic_RS) # clean up





##### Figure A3, Most and Least aligned Firms/Occupations
# load individual level data
load("main_results_ind.RDta")


# get Occupation names from O*NET CrossWalk File
occ_crossw <- rio::import("soc_2010_titles.csv")

# get mean alignment scores by occupation,
# limit to occupations with min 100 observations,
# and join occupation names
occ_align <- pc_don %>%
  group_by(soc_2010) %>% # group by occupation
  summarise(Align = mean(Align, na.rm = TRUE), n = n())  %>% # mean alignment
  arrange(-Align) %>% # order by alignment score
  filter(n >=100) %>%  # exclude occupations with few observations
  left_join(occ_crossw, by = "soc_2010") %>% # match occupation names
  select(soc_2010, soc_2010_title, Align, n) %>% na.omit()

# create Table with Top 10 and Bottom 10 Occupations
occ_align <- rbind(tail(occ_align, 10), head(occ_align, 10))
colnames(occ_align) <- c("SOC 2010 Code", "SOC Name", "Align", "n")
occ_align <- occ_align[order(occ_align$Align),]



# match corresponding occupation names in short format for nicer looking plot
occ_align$`SOC Name` <- c("Editors", "Flight Attendants", "Counter Attendants", "Web Developers", "Database Administrators",
                          "Chemists", "Tax Preparers", "Statisticians", "Registered Nurses",
                          "Market Research Analysts",
                          "PR & Fundraising Manager", "Financial Managers", "Sales Managers", "Sales Representatives",
                          "PR Specialists", "General Managers", "Political Scientists", "Engineering Technicians",
                          "Geoscientists", "Chief Executives")


# plot and output to file
pdf("figure_A3a.pdf", width = 7, height = 6)
par(mfrow = c(1,1), cex.lab = 1.5, cex.axis = 1.25, mar = c(4.5,13.5,2,2), cex.main = 1.6)
x <- barplot(occ_align$Align, xlab="Alignment", names.arg = occ_align$`SCO Name`, horiz = TRUE, las = 1, xlim = c(0,1.15),
             main = "10 Most & Least Aligned Occupations", col = "grey55")
grid(col = "lightgrey", lty = "solid")
barplot(occ_align$Align, xlab="Alignment", names.arg = occ_align$`SOC Name`, horiz = TRUE, las = 1, xlim = c(0,1.15),
        main = "10 Most & Least Aligned Occupations", add = TRUE, col ="grey55")
abline(h = 12.085, lwd = 3)
arrows(x0 = 1, y0 = 12.085, x1 = 1, y1 = 15, length = 0.25,
       code = 2, col = "black", lwd = 2)
arrows(x0 = 1, y0 = 12.085, x1 = 1, y1 = 9.085, length = 0.25,
       code = 2, col = "black", lwd = 2)
text(x = 1, y = 15.5, labels = "Most Aligned", cex = 1.1)
text(x = 1, y = 8.4, labels = "Least Aligned", cex = 1.1)
dev.off()
rm(occ_align, x) # clean up
rm(occ_crossw, pc_don)



# load CSV with firm names
firm_names <- read.csv("firmnames.csv", stringsAsFactors = FALSE)

# load individual level data
load("main_results_ind.RDta")

# prepare data
gvkey_align <- pc_don %>% group_by(gvkey) %>%
  summarise(Align = mean(Align, na.rm = TRUE), n = n())  %>% # get mean alignment scores by firm
  arrange(-Align) %>% # order by alignment
  filter(n >=100) %>% # limit to firms with min 100 observations
  left_join(firm_names, by = "gvkey") %>% # and join firm names
  select(gvkey,conm, Align, n) %>% na.omit() # remove NAs

# create Table with top 10 and bottom 10 firms
gvkey_align <- rbind(tail(gvkey_align, 10), head(gvkey_align, 10))
colnames(gvkey_align) <- c("gvkey", "Company Name", "Align", "n")
gvkey_align <- gvkey_align[order(gvkey_align$Align),] # order



# match corresponding firm names in short format for plot, for better looking plot
gvkey_align$`Company Name` <- rev(c("Timken", "Marathon Petroleum", " Devon Energy", "Energy Transfer Partners",
                                    "Scana","Occidental Petroleum", "Anadarko Petroleum", "Chesapeake Energy", "Deere",
                                    "Apache",
                                    "Teradata", "Harley-Davidson", "Acxiom", "Meredith", "Netflix",
                                    "Brunswick", "Werner Enterprises", "Broadcom" , "J.P. Morgan", "Time"))

# plot and output figure
pdf("figure_A3b.pdf", width = 7, height = 6)
par(mfrow = c(1,1), cex.lab = 1.5, cex.axis = 1.25, mar = c(4.5,12.5,2,1), cex.main = 1.6)
x <- barplot(gvkey_align$Align, xlab="Alignment", names.arg = gvkey_align$`Company Name`, horiz = TRUE, las = 1, xlim = c(0,1.15),
             main = "10 Most & Least Aligned Firms", col = "grey55")
grid(col = "lightgrey", lty = "solid")
barplot(gvkey_align$Align, xlab="Alignment", names.arg = gvkey_align$`Company Name`, horiz = TRUE, las = 1, xlim = c(0,1.15),
        main = "10 Most & Least Aligned Firms", add = TRUE, col ="grey55")
abline(h = 12.085, lwd = 3)
arrows(x0 = 1, y0 = 12.085, x1 = 1, y1 = 15, length = 0.25,
       code = 2, col = "black", lwd = 2)
arrows(x0 = 1, y0 = 12.085, x1 = 1, y1 = 9.085, length = 0.25,
       code = 2, col = "black", lwd = 2)
text(x = 1, y = 15.5, labels = "Most Aligned", cex = 1.1)
text(x = 1, y = 8.4, labels = "Least Aligned", cex = 1.1)
dev.off()
rm(gvkey_align, x, firm_names) # clean up





#### Figure A4
# Replicate Figure 4b with 60 matching score for occupation
setwd(dirname(rstudioapi::getActiveDocumentContext()$path))
load("app_results_ind_match_60.RDta")

pac_ceo <- pc_don  %>%  group_by(gvkey, soc) %>% summarise(Align = mean(Align, na.rm = TRUE), n = n()) %>%
  mutate(CEO = ifelse(soc == "11-1011.00", 1, 0))

pdf("figure_A4.pdf", width = 7, height = 8)
par(mfrow = c(1,1), mar = c(5,5.5,2,1), las = 1, cex.axis = 1.35, cex.lab = 1.6, cex.main = 1.8)

boxplot(Align ~ CEO, data = pac_ceo, horizontal = FALSE, axes=FALSE,  # be careful, order of categories is reversed when TRUE
        las = 1, outpch=21, outbg = "darkgrey", outcex = 0.65, col = "grey55", boxwex = 0.25, lwd = 1.35,
        ylim = c(0, 1), xlim = c(0.75, 2.25), xaxt = "n", ylab = "", xlab = "")
title(ylab = "Alignment (firm mean)", xlab = "Occupation", font.lab = 2)
abline(h = 0.5, lwd = 2, lty = "dashed", col = "lightgrey")
grid(col = "lightgrey", lty = "solid")
boxplot(Align ~ CEO, data = pac_ceo, horizontal = FALSE, axes=FALSE, # be careful, order of categories is reversed when TRUE
        las = 1, outpch=21, outbg = "darkgrey", outcex = 0.65, col = "grey55", add = TRUE, boxwex = 0.25, lwd = 1.35,
        ylim = c(0, 1), xlim = c(0.75, 2.25),xaxt = "n")
axis(1, at = c(1,2), labels = c("Other Employees", "Chief Executives"))
axis(2, at = seq(0, 1, 0.2))
dev.off()
rm(pac_ceo, pc_don) # clean up


#### Figure A5

# Replicate Figure 2 while excluding all extreme values, 0 and 1
load("R_shares.RDta") # read data, appendix
ic_RS <- ic_RS %>% filter(office_race == "all_appendix")

# Scatter Plot: IND vs PAC Rshare, no 1s and 0s
pdf("figure_A5.pdf", width = 10, height = 8)
par(mfrow = c(1,1), mar = c(5,5.5,4,2), las = 1, cex.axis = 1.35, cex.lab = 1.6, cex.main = 1.8)
plot(ic_RS$Rshare_c, ic_RS$Rshare_i, pch = 16,col = "darkgrey", las = 0.8, font.lab = 2, axes=FALSE,
     main = "",  ylim = c(0,1), xlim = c(0,1),
     xlab = "Firm REP Contribution Share", ylab = "Employee REP Contribution Share")
grid(col = "lightgrey", lty = "solid")
points(ic_RS$Rshare_c, ic_RS$Rshare_i, pch = 1,col = "black", las = 0.8, font.lab = 2,
       main = "",  ylim = c(0,1), xlim = c(0,1))
text(x = 0.1, y = 0.95, labels = expression(paste(beta, " = 0.75")), cex = 1.75) # R2 = 0.27
axis(1, at = seq(0, 1, 0.2))
axis(2, at = seq(0, 1, 0.2))
abline(lm(ic_RS$Rshare_i ~ ic_RS$Rshare_c), col = "black", lwd = 2)
dev.off()
rm(ic_RS) # clean up




#############################################
######## Appendix Descriptive Tables ########
#############################################



#### Table A2, Company Counts
# load data
load("ind_dv.RDta")

# Function for Bold Column Names
bold <- function(x) {
  paste0('{\\bfseries ', x, '}')
}



naics_2012 <- read.csv("naics_2012_titles.csv", stringsAsFactors = FALSE, sep = ",")[,1:2] # codes


# aggregate by naics code and company
container <- ind_dv %>% 
  group_by(conm, naics6) %>% # group by company name  and industry
  summarise(Frequency = n()) %>% # create frequencies
  arrange(desc(Frequency)) %>% # descending order
  left_join(naics_2012, by = c("naics6" = "naics_2012")) %>% # add industry titles
  filter(!is.na(conm)) %>% 
  select("Company Name" = conm, "NAICS Code"=naics6, "NAICS Title" = naics_2012_title, "Frequency"=Frequency)

# change names for table
container$`NAICS Title`[is.na(container$`NAICS Title`)] <- "Unknown/Other" # some have no Title for their NAICS
container$`NAICS Title` <- gsub("T $", "", container$`NAICS Title`, fixed = FALSE) # make shorter title for table
container$`NAICS Title` <- gsub(" and Related Activities", "", container$`NAICS Title`, fixed = FALSE) # same

# output to Table
print(xtable::xtable(container[1:40,], caption = "\\textbf{Most frequent firms in linked Firm-Employee Campaign Contributions Data}. The table shows the distribution of 40 most common firms in the linked employer-employee data, their matched North American Industrial Classification System (NAICS) code, as well as their industry title.", label = "firm_count"),
      include.rownames = FALSE, file = "table_A2.tex",
      caption.placement = "bottom", booktabs = TRUE, table.placement = "H", sanitize.colnames.function = bold)





##### Table A3, Industry Counts
# aggregate by naics code
container <- ind_dv %>% 
  group_by(naics3) %>% # group by industry
  summarise(Frequency = n()) %>% # create frequencies
  arrange(desc(Frequency)) %>% # descending order
  left_join(naics_2012, by = c("naics3" = "naics_2012")) %>% # add titles
  filter(!is.na(naics3)) %>%
  select("NAICS Code" = naics3, "NAICS Title"=naics_2012_title, "Frequency"=Frequency)

container$`NAICS Title`[is.na(container$`NAICS Title`)] <- "Unknown/Other" # some no Title for their NAICS
container$`NAICS Title` <- gsub("T $", "", container$`NAICS Title`, fixed = FALSE) # make shorter title for table
container$`NAICS Title` <- gsub(" and Related Activities", "", container$`NAICS Title`, fixed = FALSE) # same

# output table
print(xtable::xtable(container[1:40,], caption = "\\textbf{Most frequent Industries in linked Firm-Employee Campaign Contributions Data}. The table shows the distribution of 40 most frequent North American Industrial Classification
System (NAICS) 3-digit industries in the linked employer-employee data.", label = "naics3_count"),
      include.rownames = FALSE, file = "table_A3.tex",
      caption.placement = "bottom", booktabs = TRUE, table.placement = "H", sanitize.colnames.function = bold)




### Table A5, Occupation Counts
# unique occupation names
occ_soc<- rio::import("soc_2010_titles.csv")

# aggregate to occupation-lvl
container <- ind_dv %>%
  group_by(soc_2010) %>% # group by soc code
  summarise(Frequency = n()) %>% # generate count
  arrange(desc(Frequency)) %>% # descending order
  filter(!is.na(soc_2010)) %>% # exclude no soc codes
  left_join(occ_soc, by = c("soc_2010" = "soc_2010"))  %>% #joint titles
  select("SOC 2010"=soc_2010, "SOC 2010 Title" = soc_2010_title, "Frequency"=Frequency) # rename



# Table for which occupations matched
print(xtable::xtable(container[1:40,], caption = "\\textbf{Unequal Frequency of Occupations.in Firm-Employee Campaign Contributions Data}. The table shows the distribution of 40 most common Standardized Occupation Classification
(SCO) codes in the linked firm-employee contributions data. The table shows that Management,
Business, Financial, and Legal occupations comprise more than half of the individual contributions.", label = "occ_count"),
      include.rownames = FALSE, file = "table_A5.tex",
      caption.placement = "bottom", booktabs = TRUE, table.placement = "H", sanitize.colnames.function = bold)

rm(container, ind_dv, naics_2012, occ_soc) # clean up




# HOW REPRESENTATIVE IS THE DATA: INDUSTRIES AND OCCUPATIOS
load("ind_contr_soc_naics.RDta")

#### Table A4: NAICS/SOC FEC individuals vs. Economy
naics2don <- ind_naics_soc %>%
  group_by(naics2) %>% # group by industry
  summarise(n = n()) # generate counts

naics2don$naics2[naics2don$naics2 %in% c("31", "32", "33")] <- "31--33"
naics2don$naics2[naics2don$naics2 %in% c("44", "45")] <- "44--45"
naics2don$naics2[naics2don$naics2 %in% c("48", "49")] <- "48--49"
naics2don <- naics2don %>%
  group_by(naics2) %>% # group by industry
  summarise(n = sum(n)) %>% # counts
  mutate(perc = round(n/sum(n)*100, digits = 1)) #calculate percentages

# NAICS2 now get the titles
naics_2012 <- read.csv("naics_2012_titles.csv", stringsAsFactors = FALSE, sep = ",")
naics2don <- left_join(naics2don, naics_2012, by = c("naics2" = "naics_2012"))
naics2don$naics_2012_title <- gsub("T$", "", naics2don$naics_2012_title, fixed = FALSE)
naics2don$naics_2012_title <- gsub(" and Related Activities", "", naics2don$naics_2012_title, fixed = FALSE)


# comparison with NAICS2-digit
naics3dig <- rio::import("naics3totemp.csv") # load employment data from BLS
naics3dig$naics2 <- substr(naics3dig$NAICS, 1, 2) # create 2-digit NAICS Code

naics3dig$naics2[naics3dig$naics2 %in% c("31", "32", "33")] <- "31--33"
naics3dig$naics2[naics3dig$naics2 %in% c("44", "45")] <- "44--45"
naics3dig$naics2[naics3dig$naics2 %in% c("48", "49")] <- "48--49"


# Aggregate employment data at 2 digit level and create shares
naics2 <- naics3dig %>% group_by(naics2) %>%
  summarise(TOT_EMP = sum(TOT_EMP, na.rm = TRUE)) %>%
  mutate(perc_real = round(TOT_EMP/sum(naics3dig$TOT_EMP)*100, digits = 1))

# # join donations and employment data, calculate difference in shares, and create nice column names
naics2 <- right_join(naics2, naics2don, by = c("naics2")) %>%
  mutate(dif = perc - perc_real) %>%
  select("NAICS Code" = naics2,
         "Industry Name" = naics_2012_title,
         "\\% 2016 US Employment" = perc_real,
         "\\% FEC Filings" = perc,
         "Difference" = dif)

naics2$`Industry Name`[naics2$`NAICS Code`=="99"] <- "Unkown" # some are unknown
naics2 <- naics2 %>%
  na.omit() %>%
  arrange(desc(Difference)) # sort by size of difference

print(xtable::xtable(naics2, caption = "\\textbf{Differences between US Industry Employment and FEC Industry Filings}. The table shows that there are large differences between 2016 US private Employment across
2-digit North American Industry Classification System (NAICS) industries and Filings per Industry
in the FEC data. Source: Bureau for Labor Statistics and own calculations.", label = "occ_count", digits = 1),
      include.rownames = FALSE, file = "table_A4.tex",
      caption.placement = "bottom", booktabs = TRUE, table.placement = "H", sanitize.colnames.function = bold)





# Table A6: FEC Occupations  vs Economy
ind_naics_soc$soc2d <- paste0(substr(ind_naics_soc$soc, 1, 2), "-0000") # create 2-digit SOC with ending 0s
soc2don <- ind_naics_soc %>%
  group_by(soc2d) %>%
  summarise(n = sum(n())) %>%
  mutate(perc = round(n/sum(n)*100, digits = 1))

# actual employment totals
soc2d <- read.csv("soc2totemp.csv", stringsAsFactors = FALSE)

# create percentage employments
soc2d <- soc2d %>%
  group_by(OCC_CODE) %>% # group by occupation
  mutate(perc_real = round(TOT_EMP/sum(soc2d$TOT_EMP)*100, digits = 1)) %>% # get percentages
  select(-TOT_EMP) # exclude

# join donations and employment data, calculate difference in shares, and create nice column names
soc2 <- right_join(soc2don, soc2d, by = c("soc2d" = "OCC_CODE")) %>%
  mutate(dif = perc - perc_real) %>% # create difference
  select("SOC Code" = soc2d, # rename column names
         "SOC Name" = OCC_TITLE,
         "\\% 2016 US Employment" = perc_real,
         "\\% FEC Filings" = perc,
         "Difference" = dif)

soc2 <- soc2 %>% arrange(desc(Difference)) # sort by size of difference

print(xtable::xtable(soc2, caption = "\\textbf{Differences between US Occupational Employment and FEC Occupation Filings}. The table shows that there are large differences between 2016 US private Employment across
2-digit Standard Occupation Classification (SOC) categories and Filings per occupation in the FEC
data. Source: Bureau for Labor Statistics and own calculations.", label = "occ2_diff", digits = 1),
      include.rownames = FALSE, file = "table_A6.tex",
      caption.placement = "bottom", booktabs = TRUE, table.placement = "H", sanitize.colnames.function = bold)




# Table A7: Cross-Validation of 1000 Employer and Occupation names
cv_emp <- read.csv("cv_replic_tab.csv", stringsAsFactors = FALSE) # input results of CV
cv_emp$acc <- NA # Accuracy
cv_emp$pre <- NA # Precision
cv_emp$rec <- NA # Reacall
cv_emp$f_1 <- NA # F1
cv_emp$threshold <- paste0("0.",cv_emp$threshold) # as string for output

### Employers
# Threshold = 81
cv_emp$acc[1] <- round(cv_emp$correct[1]/1000, 2)
cv_emp$pre[1] <- round(cv_emp$correct_m[1]/(cv_emp$correct_m[1]+cv_emp$incorrect_m[1]), 2)
cv_emp$rec[1] <- round(cv_emp$correct_m[1]/(cv_emp$correct_m[1]+cv_emp$incorrect_nm[1]), 2)
cv_emp$f_1[1] <- round(2*((cv_emp$pre[1]*cv_emp$rec[1])/(cv_emp$pre[1]+cv_emp$rec[1])), 2)


# Threshold = 70
cv_emp$acc[2] <- round(cv_emp$correct[2]/1000, 2)
cv_emp$pre[2] <- round(cv_emp$correct_m[2]/(cv_emp$correct_m[2]+cv_emp$incorrect_m[2]), 2)
cv_emp$rec[2] <- round(cv_emp$correct_m[2]/(cv_emp$correct_m[2]+cv_emp$incorrect_nm[2]), 2)
cv_emp$f_1[2] <- round(2*((cv_emp$pre[2]*cv_emp$rec[2])/(cv_emp$pre[2]+cv_emp$rec[2])), 2)


# Threshold = 60
cv_emp$acc[3] <- round(cv_emp$correct[3]/1000, 2)
cv_emp$pre[3] <- round(cv_emp$correct_m[3]/(cv_emp$correct_m[3]+cv_emp$incorrect_m[3]), 2)
cv_emp$rec[3] <- round(cv_emp$correct_m[3]/(cv_emp$correct_m[3]+cv_emp$incorrect_nm[3]), 2)
cv_emp$f_1[3] <- round(2*((cv_emp$pre[3]*cv_emp$rec[3])/(cv_emp$pre[3]+cv_emp$rec[3])), 2)


# put in row with NAs, beginning and end, for nicer-looking table
cv_emp <- rbind(rep(NA, 9), cv_emp[c(1:3),], rep(NA, 9), cv_emp[c(4:6),])
cv_emp$threshold[1] <- "Employers"
cv_emp$threshold[5] <- "Occupations"
cv_emp$type <- NULL


### Occupations
# Threshold = 72
cv_emp$acc[6] <- round(cv_emp$correct[6]/1000, 2)
cv_emp$pre[6] <- round(cv_emp$correct_m[6]/(cv_emp$correct_m[6]+cv_emp$incorrect_m[6]), 2)
cv_emp$rec[6] <- round(cv_emp$correct_m[6]/(cv_emp$correct_m[6]+cv_emp$incorrect_nm[6]), 2)
cv_emp$f_1[6] <- round(2*((cv_emp$pre[6]*cv_emp$rec[6])/(cv_emp$pre[6]+cv_emp$rec[6])), 2)


# Threshold = 60
cv_emp$acc[7] <- round(cv_emp$correct[7]/1000, 2)
cv_emp$pre[7] <- round(cv_emp$correct_m[7]/(cv_emp$correct_m[7]+cv_emp$incorrect_m[7]), 2)
cv_emp$rec[7] <- round(cv_emp$correct_m[7]/(cv_emp$correct_m[7]+cv_emp$incorrect_nm[7]), 2)
cv_emp$f_1[7] <- round(2*((cv_emp$pre[7]*cv_emp$rec[7])/(cv_emp$pre[7]+cv_emp$rec[7])), 2)


# Threshold = 50
cv_emp$acc[8] <- round(cv_emp$correct[8]/1000, 2)
cv_emp$pre[8] <- round(cv_emp$correct_m[8]/(cv_emp$correct_m[8]+cv_emp$incorrect_m[8]), 2)
cv_emp$rec[8] <- round(cv_emp$correct_m[8]/(cv_emp$correct_m[8]+cv_emp$incorrect_nm[8]), 2)
cv_emp$f_1[8] <- round(2*((cv_emp$pre[8]*cv_emp$rec[8])/(cv_emp$pre[8]+cv_emp$rec[8])), 2) # note that R rounds to the nearest even number, hence 625/1000 is rounded to 0.62 instead of 0.63, while 635 would be rounded up to 0.64

# as characters, so that these columns are as integers when printed to table
cv_emp$correct <- as.character(cv_emp$correct)
cv_emp$correct_m <- as.character(cv_emp$correct_m)
cv_emp$incorrect_m <- as.character(cv_emp$incorrect_m)
cv_emp$incorrect_nm <- as.character(cv_emp$incorrect_nm)



# Table Column Names for Output (need to adjust multirow column names manually)
colnames(cv_emp) <- c("Threshold",
                      "Correct Overall", "Correctly Matched (TP)", "Incorrectly Matched (FP)", "Incorrectly Not Matched (FN)",
                      "Accuracy (Frac. Correct)", "Precision TP/(TP+FP)", "Recall TP(TP+FN)", "F1")


print(xtable::xtable(cv_emp, caption = "Cross-Validation of 1000 Employer and Occupation Names. This Table shows the quality of matched firm and occupation codes to employer and occupation
names, using a random sample of 1000 employer names and a random sample of 100 occupation
names. The top row for occupations and employers shows the specification used for this paper. While
the chosen threshold for employer names is a good compromise between accuracy, precision, and
recall, the threshold for occupations puts more emphasis on precision, i.e. matching only occupation
names to codes when it is very likely to be a good match."),
      file = "table_A7.tex", append=FALSE,caption.placement="bottom",
      sanitize.colnames.function = bold, include.rownames=FALSE)

rm(cv_emp, ind_naics_soc, naics_2012, naics2, naics2don, naics3dig, soc2, soc2d, soc2don, bold) # clean up






#############################################
########### Appendix Regressions ############
#############################################

setwd(dirname(rstudioapi::getActiveDocumentContext()$path))
load("app_results_ind_match_60.RDta") # data lower threshold for matching of occupations (cosine similarity = 60)


# Table A8: CEO Models with lower threshold for matching of occupations (cosine similarity = 60)
lm_ceo1 <- felm(Align ~  CEO  | 0 | 0 | gvkey, data = pc_don)
lm_ceo2 <- felm(Align ~  CEO + emp + capx + ppent + cogs + sale | 0 | 0 | gvkey, data = pc_don)
lm_ceo3 <- felm(Align ~  CEO + emp + capx + ppent + cogs + sale | cycle  | 0 | gvkey, data = pc_don)
lm_ceo4 <- felm(Align ~  CEO + emp + capx + ppent + cogs + sale | cycle + gvkey  | 0 | gvkey, data = pc_don)
lm_ceo5 <- felm(Align ~  CEO + emp + capx + ppent + cogs + sale | cycle + gvkey + fips | 0 | gvkey, data = pc_don)
lm_ceo6 <- felm(Align ~  CEO + emp + capx + ppent + cogs + sale | cycle + gvkey + fips + soc2 | 0 | gvkey, data = pc_don)


stargazer::stargazer(lm_ceo1, lm_ceo2, lm_ceo3, lm_ceo4, lm_ceo5, lm_ceo6, type = "latex", title = "Regression Results: Chief Executives and Alignment", label = "lm_ceo1_occ60",
                     out = "table_A8.tex",
                     covariate.labels = c("CEO"), dep.var.labels = rep(c("Align"),1), omit = c("emp", "capx", "ppent", "cogs", "sale", "Constant"),
                     add.lines = list(c("Firm Controls",  "", "$\\checkmark$", "$\\checkmark$", "$\\checkmark$", "$\\checkmark$", "$\\checkmark$"),
                                      c("Cycle FEs",  "", "", "$\\checkmark$", "$\\checkmark$", "$\\checkmark$", "$\\checkmark$"),
                                      c("Firm FEs",  "", "", "","$\\checkmark$", "$\\checkmark$",  "$\\checkmark$"),
                                      c("County FEs", "", "", "", "", "$\\checkmark$", "$\\checkmark$"),
                                      c("SOC 2-digit FEs",  "", "", "", "", "", "$\\checkmark$")),omit.stat = c("rsq", "ser"),
                     column.sep.width = "3pt", notes = "Standard errors clustered by firm.", notes.align = "l")

rm(lm_ceo1, lm_ceo2, lm_ceo3, lm_ceo4, lm_ceo5, lm_ceo6, pc_don)



# Table A9: Individual Republican Share and Company Republican Share while excluding extreme values, 0 and 1
load("app_results_firm.RDta")

lm_RS1 <- felm(Rshare_i ~ Rshare_c  | 0 | 0 | gvkey, data = ic_RS)
lm_RS2 <- felm(Rshare_i ~ Rshare_c + emp + capx + ppent + cogs + sale | 0 | 0 | gvkey, data = ic_RS)
lm_RS3 <- felm(Rshare_i ~ Rshare_c + emp + capx + ppent + cogs + sale | cycle | 0 | gvkey, data = ic_RS)
lm_RS4 <- felm(Rshare_i ~ Rshare_c + emp + capx + ppent + cogs + sale | cycle + gvkey  | 0 | gvkey, data = ic_RS)


stargazer::stargazer(lm_RS1, lm_RS2, lm_RS3, lm_RS4, type = "latex", title = "Regression Results: Employee and Firm Donations, excluding extreme values", label = "lm_RS1",
                     out = "table_A9.tex",
                     covariate.labels = c("Firm REP Donation Share"), dep.var.labels = rep(c("Employee REP Donation Share"),1), omit = c("emp", "capx", "ppent", "cogs", "sale", "Constant"),
                     add.lines = list(c("Firm Controls",  "", "$\\checkmark$", "$\\checkmark$", "$\\checkmark$"),
                                      c("Cycle FEs",  "", "", "$\\checkmark$", "$\\checkmark$"),
                                      c("Firm FEs",  "", "", "","$\\checkmark$")),omit.stat = c("rsq", "ser"),
                     column.sep.width = "3pt", notes = "Standard errors clustered by firm.", notes.align = "l")

rm(lm_RS1, lm_RS2, lm_RS3, lm_RS4, ic_RS)






# Table A10: Employees and Alignment while exluding extreme values, 0 and 1
load("app_results_ind.RDta")


library(lfe)
lm_emp1 <- felm(Align ~  emp | 0   | 0 | gvkey, data = pc_don)
lm_emp2 <- felm(Align ~  emp + capx + ppent + cogs + sale | 0  | 0 | gvkey, data = pc_don)
lm_emp3 <- felm(Align ~  emp + capx + ppent + cogs + sale | cycle  | 0 | gvkey, data = pc_don)
lm_emp4 <- felm(Align ~  emp + capx + ppent + cogs + sale | cycle + soc_2010  | 0 | gvkey, data = pc_don)
lm_emp5 <- felm(Align ~  emp + capx + ppent + cogs + sale | cycle + soc_2010 + fips  | 0 | gvkey, data = pc_don)
lm_emp6 <- felm(Align ~  emp + capx + ppent + cogs + sale | cycle + soc_2010 + fips + naics2  | 0 | gvkey, data = pc_don)


stargazer::stargazer(lm_emp1, lm_emp2, lm_emp3, lm_emp4, lm_emp5, lm_emp6, type = "latex", title = "Regression Results: Firm Size and Alignment, excluding extreme values", label = "lm_emp1",
                     out = "table_A10.tex",
                     covariate.labels = c("log(Employees)"), dep.var.labels = rep(c("Align"),1), omit = c("capx", "ppent", "cogs", "sale", "Constant"),
                     add.lines = list(c("Firm Controls",  "", "$\\checkmark$", "$\\checkmark$", "$\\checkmark$", "$\\checkmark$", "$\\checkmark$"),
                                      c("Cycle FEs",  "", "", "$\\checkmark$", "$\\checkmark$", "$\\checkmark$", "$\\checkmark$"),
                                      c("Occupation FEs",  "", "", "", "$\\checkmark$", "$\\checkmark$", "$\\checkmark$"),
                                      c("County FEs", "", "", "", "", "$\\checkmark$", "$\\checkmark$"),
                                      c("NAICS 2-digit FEs", "", "", "", "", "", "$\\checkmark$")),omit.stat = c("rsq", "ser"),
                     column.sep.width = "3pt", notes = "Standard errors clustered by firm.", notes.align = "l")

rm(lm_emp1, lm_emp2, lm_emp3, lm_emp4, lm_emp5, lm_emp6, pc_don) # clean up




# Table A11: CEO/Executives and Alignment while exluding extreme values, 0 and 1
load("app_results_ind.RDta")


lm_ceo1 <- felm(Align ~  CEO  | 0 | 0 | gvkey, data = pc_don)
lm_ceo2 <- felm(Align ~  CEO + emp + capx + ppent + cogs + sale | 0 | 0 | gvkey, data = pc_don)
lm_ceo3 <- felm(Align ~  CEO + emp + capx + ppent + cogs + sale | cycle  | 0 | gvkey, data = pc_don)
lm_ceo4 <- felm(Align ~  CEO + emp + capx + ppent + cogs + sale | cycle + gvkey  | 0 | gvkey, data = pc_don)
lm_ceo5 <- felm(Align ~  CEO + emp + capx + ppent + cogs + sale | cycle + gvkey + fips | 0 | gvkey, data = pc_don)
lm_ceo6 <- felm(Align ~  CEO + emp + capx + ppent + cogs + sale | cycle + gvkey + fips + soc2 | 0 | gvkey, data = pc_don)


stargazer::stargazer(lm_ceo1, lm_ceo2, lm_ceo3, lm_ceo4, lm_ceo5, lm_ceo6, type = "latex", title = "Regression Results: Chief Executives and Alignment, excluding extreme values", label = "lm_ceo1",
                     out = "table_A11.tex",
                     covariate.labels = c("CEO"), dep.var.labels = rep(c("Align"),1), omit = c("emp", "capx", "ppent", "cogs", "sale", "Constant"),
                     add.lines = list(c("Firm Controls",  "", "$\\checkmark$", "$\\checkmark$", "$\\checkmark$", "$\\checkmark$", "$\\checkmark$"),
                                      c("Cycle FEs",  "", "", "$\\checkmark$", "$\\checkmark$", "$\\checkmark$", "$\\checkmark$"),
                                      c("Firm FEs",  "", "", "","$\\checkmark$", "$\\checkmark$",  "$\\checkmark$"),
                                      c("County FEs", "", "", "", "", "$\\checkmark$", "$\\checkmark$"),
                                      c("SOC 2-digit FEs",  "", "", "", "", "", "$\\checkmark$")),omit.stat = c("rsq", "ser"),
                     column.sep.width = "3pt", notes = "Standard errors clustered by firm.", notes.align = "l")

rm(lm_ceo1, lm_ceo2, lm_ceo3, lm_ceo4, lm_ceo5, lm_ceo6, pc_don)





############################
############################
############################


# close log file
sink()
